import warnings
warnings.filterwarnings("ignore")
import shutil
import os
import pandas as pd
import matplotlib
matplotlib.use(u'nbAgg')
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pickle
from sklearn.manifold import TSNE
from sklearn import preprocessing
import pandas as pd
from multiprocessing import Process# this is used for multithreading
import multiprocessing
import codecs# this is used for file operations
import random as r
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import sqlite3
from tqdm import tqdm
import random
import threading
from multiprocessing import Process
# conn = sqlite3.connect('malware.db',check_same_thread=False)
# c=conn.cursor()
# conn.close()
def create_connection():
    """Open (or create) the local SQLite feature store used by this script.

    check_same_thread=False lets the same connection object be shared by the
    worker threads/processes started further down the pipeline.
    """
    return sqlite3.connect('malware.db', check_same_thread=False)
###### create table id_Class and load the Kaggle training labels into it
with create_connection() as conn:
    c = conn.cursor()
    # BUG FIX: the original spelled the constraint "QNIQUE"; SQLite silently
    # accepted it as part of the column type name. PRIMARY KEY already implies
    # uniqueness, but the declaration now says what was intended.
    c.execute('CREATE TABLE IF NOT EXISTS id_Class(id TEXT UNIQUE PRIMARY KEY,Class INTEGER )')
    Y = pd.read_csv("trainLabels.csv")
    # if_exists='replace' drops and recreates the table, so the CREATE above
    # only matters if this block is interrupted before this line runs.
    Y.to_sql('id_class', conn, if_exists='replace', index=False)

###### class counts in dataset
ax = sns.countplot(x="Class", data=Y)
plt.show()

###### percentage distribution of classes
total = len(Y) * 1.
ax = sns.countplot(x="Class", data=Y)
for p in ax.patches:
    ax.annotate('{:.1f}%'.format(100 * p.get_height() / total),
                (p.get_x() + 0.1, p.get_height() + 5))
# put 11 ticks (therefore 10 steps), from 0 to the total number of rows
ax.yaxis.set_ticks(np.linspace(0, total, 11))
# adjust the ticklabels to percentages without changing the tick positions
ax.set_yticklabels(map('{:.1f}%'.format, 100 * ax.yaxis.get_majorticklocs() / total))
plt.show()
###### create table id_size: size of each .bytes file in MB
with create_connection() as conn:
    c = conn.cursor()
    # BUG FIX: the original spelled the constraint "QINIQUE" (absorbed by
    # SQLite into the type name); PRIMARY KEY implies UNIQUE anyway.
    c.execute('CREATE TABLE IF NOT EXISTS id_size(id TEXT UNIQUE PRIMARY KEY,size REAL)')
    for f in Y['Id']:
        # convert the size into MB, rounded to 2 decimals
        size_mb = os.path.getsize('./byteFiles/' + f + '.bytes') / (1024 * 1024)
        c.execute("INSERT OR REPLACE INTO id_size VALUES(?,?)", (f, round(size_mb, 2)))
    # commit once after the loop instead of per row -- one short transaction
    conn.commit()
    df1 = pd.read_sql_query('SELECT a.Id,a.Class, b.size FROM id_Class a INNER JOIN id_size b ON a.id=b.id', conn)

df1.head()  # notebook remnant: has no visible effect when run as a script
ax = sns.boxplot(x="Class", y="size", data=df1)
plt.title("boxplot of .bytes file sizes")
plt.show()
####### generate names of all possible 1-grams:
# every two-hex-digit byte value '00'..'FF' plus the '??' placeholder token
a = list('0123456789ABCDEF')
h1 = [hi + lo for hi in a for lo in a]
h1.append("??")
# double-quoted copies, usable as SQL column identifiers
h2 = ['"' + gram + '"' for gram in h1]
####### creating table of one gram "id_ngram"
# Wide table: one INTEGER column per possible 1-gram; the names come from h2,
# whose entries are already double-quoted so "00".."FF" and "??" are legal
# SQL identifiers. The cursor `c` and connection `conn` created here are
# reused by the single-thread ingestion loop below.
with create_connection() as conn:
    c = conn.cursor()
    # NOTE(review): the column names are generated locally (h2), not taken
    # from user input, so this string formatting is not an injection risk.
    columns = ", ".join(" {fld} INTEGER ".format(fld=field) for field in h2)
    c.execute("""
CREATE TABLE IF NOT EXISTS id_ngram(
Id TEXT PRIMARY KEY UNIQUE,
{cls}
)
""".format(cls=columns))
#### parameterized-query helper (keeps file-derived values out of the SQL text)
def getString(cnt):
    """Return a comma-separated list of `cnt` SQL placeholders, e.g. "?, ?, ?".

    Used to build parameterized INSERT statements so values are bound by the
    sqlite3 driver rather than spliced into the SQL string.

    Parameters
    ----------
    cnt : int
        Number of '?' placeholders. cnt <= 0 yields '' (the original
        returned a spurious single '?' for cnt == 0).
    """
    # join is linear; the original built the string with += in a loop
    return ', '.join(['?'] * cnt)
# %%time  (IPython cell magic from the source notebook -- invalid in a .py file)
########## Dataset creation -- single thread: per-file 1-gram counts into id_ngram ##########
with create_connection() as conn:
    c = conn.cursor()
    for f in tqdm(Y['Id']):
        # start every byte-value count (plus '??') at zero for this file
        d = {k: 0 for k in h1}
        # read-only mode is enough; the original opened '+r' (read/write)
        # and leaked the handle (fh.close() was commented out)
        with open('byteFiles/' + f + '.bytes', 'r') as fh:
            for line in fh:
                for tok in line.split():
                    # length-2 tokens are hex byte values or '??'; longer
                    # tokens are the leading address column -- skip them
                    if len(tok) == 2:
                        d[tok] = d[tok] + 1
        # 258 placeholders: Id + 257 count columns (dict preserves h1 order)
        t = tuple([f] + list(d.values()))
        c.execute('INSERT OR REPLACE INTO id_ngram VALUES (' + getString(258) + ')', t)
    # BUG FIX: the original never committed (line was commented out), so the
    # inserted rows could be lost when the connection was closed
    conn.commit()

####### sample of id_ngram
df = pd.read_sql_query("SELECT * FROM id_ngram", conn)
print(df.shape)
df.head()
################################## Multithreading ####################################
###### one-time job: distribute the .bytes files across n folders
def move_to_n_folders(n):
    """Copy every .bytes file into one of n folders ("0" .. str(n-1)),
    chosen uniformly at random, so each worker process ingests one folder.

    Parameters
    ----------
    n : int
        Number of destination folders / worker processes.
    """
    for i in range(n):
        # BUG FIX: the original called os.path.isdir(i) with an int, which
        # os.stat interprets as a file descriptor, not the folder name.
        if not os.path.isdir(str(i)):
            os.makedirs(str(i))
    for f in Y['Id']:
        # BUG FIX: the original used randrange(1, n), which never selects
        # folder 0 -- the worker assigned to "0" would find no files.
        # (It also shadowed the module alias `random as r` with a local `r`.)
        dest = random.randrange(n)
        shutil.copyfile("./byteFiles/" + f + ".bytes",
                        "./" + str(dest) + "/" + f + ".bytes")
###### creation of dataset id_ngram and writing to DB in multiple workers
def create_n_threads(n=0):
    """Worker: count 1-grams for every .bytes file in folder ./<n> and write
    one row per file (keyed by the file Id) into the database.

    Parameters
    ----------
    n : int
        Folder index this worker is responsible for (defaults to 0).
    """
    with create_connection() as conn:
        c = conn.cursor()
        for fname in os.listdir("./" + str(n)):
            counts = {k: 0 for k in h1}
            # BUG FIX: the original opened '+r' (read/write) and only closed
            # the handle left over after the loop, leaking all the others
            with open('./' + str(n) + '/' + fname, 'r') as fh:
                for line in fh:
                    for tok in line.split():
                        if len(tok) == 2:  # hex byte value or '??'
                            counts[tok] = counts[tok] + 1
            # file name is "<Id>.bytes"; strip the extension for the key
            row = tuple([fname.split(".")[0]] + list(counts.values()))
            # NOTE(review): this targets id_ngram_sample, but only id_ngram is
            # created above -- confirm the table name is intentional.
            c.execute('INSERT OR REPLACE INTO id_ngram_sample VALUES (' + getString(258) + ')', row)
            # commit per file keeps each write transaction short while many
            # workers share the database file
            conn.commit()
##Multithreading code
# %%time
# n=30 #######number of cores
# t=[None] * n
# for i in range(n):
# t[i] = threading.Thread(target=create_n_threads, args=(i,))
# t[i].start()
# for i in range(n):
# t[i].join()
# print("completed")
########################### Multi processing code ########################
n = 28  # number of worker processes, one per folder
if __name__ == "__main__":  # only the parent process runs this
    procs = []
    for worker_id in range(n):
        # the original started worker 0 with no arguments; that is identical
        # to passing 0 explicitly because create_n_threads defaults to n=0
        p = Process(target=create_n_threads, args=(worker_id,))
        procs.append(p)
        p.start()
    # wait for every worker to finish
    for p in procs:
        p.join()
###################### merging the three tables create final dataset and normalize it
# INNER JOIN on the file id: 1-gram counts (id_ngram) + file size in MB
# (id_size) + class label (id_Class). Relies on the module-level `conn`
# opened earlier still being usable.
ds=pd.read_sql_query("SELECT a.* , b.size , c.Class FROM id_ngram a INNER JOIN id_size b ON a.Id=b.id INNER JOIN id_Class c ON a.Id=c.id ",conn)
data_y=ds['Class']
ds.head()  # notebook remnant: has no visible effect when run as a script
###### normalize above table
def normalize(df):
    """Min-max scale every feature column of `df` into [0, 1].

    The 'Id' and 'Class' columns are passed through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table with 'Id' and 'Class' columns plus numeric features.

    Returns
    -------
    pandas.DataFrame
        A scaled copy; the input frame is not modified.
    """
    result1 = df.copy()
    for feature_name in df.columns:
        if feature_name != 'Id' and feature_name != 'Class':
            max_value = df[feature_name].max()
            min_value = df[feature_name].min()
            span = max_value - min_value
            # BUG FIX: a constant column made the original divide by zero and
            # fill the column with NaN; map such columns to 0.0 instead.
            if span == 0:
                result1[feature_name] = 0.0
            else:
                result1[feature_name] = (df[feature_name] - min_value) / span
    return result1
# min-max scale all 1-gram count and size columns into [0, 1]
ds_n = normalize(ds)
############### After normalization table looks like
ds_n.head()  # notebook remnant: has no visible effect when run as a script
######### Multivariate analysis of the dataset using t-SNE
def _plot_tsne(tsne, features, labels):
    """Fit `tsne` on `features` and scatter-plot the 2-D embedding,
    colouring each point by its malware class label (1..9).

    Parameters
    ----------
    tsne : sklearn.manifold.TSNE
        A configured (unfitted) t-SNE instance.
    features : pandas.DataFrame
        Feature matrix to embed.
    labels : array-like
        Class label per row, used as the colour value.
    """
    dims = tsne.fit_transform(features)
    vis_x = dims[:, 0]  # first embedded dimension
    vis_y = dims[:, 1]  # second embedded dimension
    plt.scatter(vis_x, vis_y, c=labels, cmap=plt.cm.get_cmap("jet", 9))
    plt.colorbar(ticks=range(10))
    plt.clim(0.5, 9)
    plt.show()

# the same data at three perplexities to check the cluster structure is
# stable (the original triplicated this whole section)
_tsne_features = ds_n.drop(['Id', 'Class'], axis=1)
_plot_tsne(TSNE(perplexity=50), _tsne_features, data_y)
_plot_tsne(TSNE(n_components=2, verbose=1, perplexity=30), _tsne_features, data_y)
_plot_tsne(TSNE(perplexity=100), _tsne_features, data_y)
############### Test Train split
# stratify keeps the same class proportions in every split
data_y = ds_n['Class']
features = ds_n.drop(['Id', 'Class'], axis=1)
# first carve off 25% as the held-out test set
X_train, X_test, y_train, y_test = train_test_split(
    features, data_y, stratify=data_y, test_size=0.25)
# then carve 25% of the remaining train data off as cross-validation data
X_train, X_cv, y_train, y_cv = train_test_split(
    X_train, y_train, stratify=y_train, test_size=0.25)
print('Number of data points in train data:', X_train.shape[0])
print('Number of data points in test data:', X_test.shape[0])
print('Number of data points in cross validation data:', X_cv.shape[0])
####### class balance in the train / test / cv splits
def _plot_split_distribution(df_y, split_name):
    """Plot class counts and the percentage distribution for one data split.

    Parameters
    ----------
    df_y : pandas.DataFrame
        Two columns: 'id' (original row index) and 'Class'.
    split_name : str
        Split name used in the plot titles ('Train', 'Test', 'cv').
    """
    sns.set(style="darkgrid")
    ax = sns.countplot(x="Class", data=df_y)
    plt.title('Class counts in ' + split_name + ' dataset')
    plt.show()
    total = len(df_y) * 1.
    ax = sns.countplot(x="Class", data=df_y)
    for p in ax.patches:
        ax.annotate('{:.1f}%'.format(100 * p.get_height() / total),
                    (p.get_x() + 0.1, p.get_height() + 5))
    # 11 ticks (10 steps) from 0 to the number of rows in the split
    ax.yaxis.set_ticks(np.linspace(0, total, 11))
    # relabel the ticks as percentages without moving them
    ax.set_yticklabels(map('{:.1f}%'.format, 100 * ax.yaxis.get_majorticklocs() / total))
    plt.title("Class distribution in " + split_name + " dataset")
    plt.show()

# (the original also overwrote the global Y with each split frame; nothing
# after this section reads Y, so that side effect is dropped)
df_y_train = pd.DataFrame({'id': y_train.index, 'Class': y_train.values})
_plot_split_distribution(df_y_train, 'Train')
df_y_test = pd.DataFrame({'id': y_test.index, 'Class': y_test.values})
_plot_split_distribution(df_y_test, 'Test')
df_y_cv = pd.DataFrame({'id': y_cv.index, 'Class': y_cv.values})
# BUG FIX: the original plotted df_y_train's counts under the 'cv' title
_plot_split_distribution(df_y_cv, 'cv')
#################### ML Models ############################################
def plot_confusion_matrix(test_y, predict_y):
    """Print the misclassification rate and plot the confusion, precision
    and recall matrices as heatmaps for the 9 malware classes.

    Parameters
    ----------
    test_y : array-like
        True class labels (1..9).
    predict_y : array-like
        Predicted class labels (1..9).
    """
    C = confusion_matrix(test_y, predict_y)
    print("Number of misclassified points ", (len(test_y) - np.trace(C)) / len(test_y) * 100)
    # A: rows normalized (recall matrix); B: columns normalized (precision matrix)
    A = (((C.T) / (C.sum(axis=1))).T)
    B = (C / C.sum(axis=0))
    labels = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    cmap = sns.light_palette("blue")
    # representing C in heatmap format
    print("="*50, "Confusion matrix", "="*50)
    plt.figure(figsize=(10, 5))
    sns.heatmap(C, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.show()
    print("-"*50, "Precision matrix", "-"*50)
    plt.figure(figsize=(10, 5))
    sns.heatmap(B, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.show()
    print("Sum of columns in precision matrix", B.sum(axis=0))
    # representing A in heatmap format
    print("-"*50, "Recall matrix", "-"*50)
    plt.figure(figsize=(10, 5))
    sns.heatmap(A, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.show()
    # BUG FIX: the original printed "precision matrix" here; A is the recall matrix
    print("Sum of rows in recall matrix", A.sum(axis=1))
############################### hyperparameter tuning (shared by all models)
def _tune_calibrated_model(build_clf, alphas, param_label):
    """Sweep `alphas` for `build_clf`, plot the cv log-loss curve, refit the
    best model with sigmoid calibration and report train/cv/test log-loss
    plus the test confusion matrices.

    The original file triplicated this entire procedure for KNN, random
    forest and XGBoost; the behavior per model is unchanged.

    Parameters
    ----------
    build_clf : callable
        Maps one hyperparameter value to a fresh, unfitted classifier.
    alphas : list
        Hyperparameter values to try.
    param_label : str
        Name printed with each cv log-loss line ('k' for KNN, 'c' otherwise).

    Returns
    -------
    CalibratedClassifierCV
        The sigmoid-calibrated classifier refit at the best alpha.
    """
    cv_log_error_array = []
    for a in alphas:
        clf = build_clf(a)
        clf.fit(X_train, y_train)
        # sigmoid (Platt) calibration so predict_proba is well-calibrated
        sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
        sig_clf.fit(X_train, y_train)
        predict_y = sig_clf.predict_proba(X_cv)
        cv_log_error_array.append(log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
    for i in range(len(cv_log_error_array)):
        print('log_loss for ' + param_label + ' = ', alphas[i], 'is', cv_log_error_array[i])
    best_alpha = np.argmin(cv_log_error_array)
    fig, ax = plt.subplots()
    ax.plot(alphas, cv_log_error_array, c='g')
    for i, txt in enumerate(np.round(cv_log_error_array, 3)):
        ax.annotate((alphas[i], np.round(txt, 3)), (alphas[i], cv_log_error_array[i]))
    plt.grid()
    plt.title("Cross Validation Error for each alpha")
    plt.xlabel("Alpha i's")
    plt.ylabel("Error measure")
    plt.show()
    # refit at the best value and report log-loss on all three splits
    clf = build_clf(alphas[best_alpha])
    clf.fit(X_train, y_train)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(X_train, y_train)
    print('For values of best alpha = ', alphas[best_alpha],
          "The train log loss is:", log_loss(y_train, sig_clf.predict_proba(X_train)))
    print('For values of best alpha = ', alphas[best_alpha],
          "The cross validation log loss is:", log_loss(y_cv, sig_clf.predict_proba(X_cv)))
    print('For values of best alpha = ', alphas[best_alpha],
          "The test log loss is:", log_loss(y_test, sig_clf.predict_proba(X_test)))
    plot_confusion_matrix(y_test, sig_clf.predict(X_test))
    return sig_clf

############################### KNN
sig_clf = _tune_calibrated_model(
    lambda k: KNeighborsClassifier(n_neighbors=k),
    [x for x in range(1, 15, 2)], 'k')

############################### Random forest
sig_clf = _tune_calibrated_model(
    lambda c: RandomForestClassifier(n_estimators=c, random_state=42, n_jobs=-1),
    [10, 50, 100, 500, 1000, 2000, 3000], 'c')

##################### Gradient boosting (XGBOOST)
sig_clf = _tune_calibrated_model(
    lambda c: XGBClassifier(n_estimators=c, nthread=-1),
    [10, 50, 100, 500, 1000, 2000], 'c')